# Load packages: tidyverse (wrangling/plots), openintro (yrbss data),
# infer (resampling inference), psych (describe()), gghighlight (highlights).
library(tidyverse)
library(openintro)
library(infer)
library(psych)
library(gghighlight)

# Summarize every column of the YRBSS survey data.
# (The original pasted `library(gghighlight)` and the pipeline onto one line,
# which is invalid R; they are split here.)
yrbss %>%
  summary ## age gender grade hispanic
## Min. :12.00 Length:13583 Length:13583 Length:13583
## 1st Qu.:15.00 Class :character Class :character Class :character
## Median :16.00 Mode :character Mode :character Mode :character
## Mean :16.16
## 3rd Qu.:17.00
## Max. :18.00
## NA's :77
## race height weight helmet_12m
## Length:13583 Min. :1.270 Min. : 29.94 Length:13583
## Class :character 1st Qu.:1.600 1st Qu.: 56.25 Class :character
## Mode :character Median :1.680 Median : 64.41 Mode :character
## Mean :1.691 Mean : 67.91
## 3rd Qu.:1.780 3rd Qu.: 76.20
## Max. :2.110 Max. :180.99
## NA's :1004 NA's :1004
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:13583 Min. :0.000 Length:13583
## Class :character 1st Qu.:2.000 Class :character
## Mode :character Median :4.000 Mode :character
## Mean :3.903
## 3rd Qu.:7.000
## Max. :7.000
## NA's :273
## strength_training_7d school_night_hours_sleep
## Min. :0.00 Length:13583
## 1st Qu.:0.00 Class :character
## Median :3.00 Mode :character
## Mean :2.95
## 3rd Qu.:5.00
## Max. :7.00
## NA's :1176
# Tally respondents in each texting-while-driving-frequency category;
# the levels include "did not drive" and NA (no response).
yrbss %>%
count(text_while_driving_30d)## # A tibble: 9 × 2
## text_while_driving_30d n
## <chr> <int>
## 1 0 4792
## 2 1-2 925
## 3 10-19 373
## 4 20-29 298
## 5 3-5 493
## 6 30 827
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
# Compute the count (n) and proportion (p) of each level of `text_ind`.
#
# @param data A data frame containing a `text_ind` column.
# @return A tibble with columns text_ind, n, p; also printed as a side effect.
getProportions <- function(data) {
  # count() already collapses to (text_ind, n), so the original leading
  # select(text_ind) was redundant and has been dropped.
  result <- data %>%
    count(text_ind) %>%
    mutate(p = n / sum(n)) %>%
    select(text_ind, n, p)
  print(result)
  result
}

# Response level used to select riders who never wear a helmet.
# (Was fused onto the closing brace above in the original paste.)
neverType <- "never"
# Shared labels for the analyses below (top-level assignment uses `<-`
# per R convention, not `=`).
type30 <- "30"    # texted while driving on all 30 of the last 30 days
yesLabel <- "yes"
noLabel <- "no"
nLabel <- "n"     # NOTE(review): appears unused in the visible code — confirm before removing

# Riders who never wear a helmet, flagged "yes"/"no" for texting on all
# 30 days; NA responses to the texting question are treated as "no".
no_helmet <- yrbss %>%
  filter(helmet_12m == neverType) %>%
  mutate(text_ind = ifelse(!is.na(text_while_driving_30d) &
                             text_while_driving_30d == type30,
                           yesLabel, noLabel))
no_helmetProportions <- no_helmet %>%
getProportions## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 6514 0.934
## 2 yes 463 0.0664
#overall <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(text_while_driving_30d)
# & text_while_driving_30d == type30
# & !is.na(helmet_12m)
# & helmet_12m == neverType, yesLabel, noLabel)) %>%
# getProportions

# Bootstrap confidence interval for the proportion of `text_ind` responses.
# (The function header was fused into the comment line above in the original
# paste, so the function was never actually defined; split here.)
#
# @param sample Data frame with a `text_ind` column ("yes"/"no").
# @param reps   Number of bootstrap resamples.
# @param yes    If TRUE, treat the "yes" level as success; otherwise "no".
# @param level  Confidence level of the interval.
# @param print  If TRUE, print the interval and a describe() summary of its
#               endpoints. (Shadows base::print as a value only; the call
#               print(interval) still resolves to the function.)
# @return A tibble with lower_ci and upper_ci.
analyzeConfidenceInterval <- function(sample, reps = 1000, yes = TRUE, level = 0.95, print = TRUE) {
  # Scalar condition, so `if () else` is idiomatic over ifelse().
  successLabel <- if (yes) yesLabel else noLabel
  interval <- sample %>%
    specify(response = text_ind, success = successLabel) %>%
    generate(reps = reps, type = "bootstrap") %>%
    calculate(stat = "prop") %>%
    get_ci(level = level)
  if (print) {
    print(interval)
    print(c(interval$upper_ci, interval$lower_ci) %>%
            describe)
  }
  return(interval)
}

# Simulate and plot the sampling distribution of p_hat for `text_ind`.
#
# @param sample   Data frame with a `text_ind` column.
# @param size     Size of each resample; defaults to nrow(sample).
# @param reps     Number of resamples.
# @param binwidth Histogram bin width.
# @param yes      If TRUE, analyze the "yes" proportion; otherwise "no".
# @return Per-replicate counts and proportions for BOTH levels (the plot and
#         printed summaries cover only the requested level).
analyzeSamplingProportionDistribution <- function(sample, size = NULL, reps = 15000, binwidth = .01, yes = TRUE) {
  # NULL sentinel: fall back to the full sample size.
  size <- if (is.null(size)) nrow(sample) else size
  sizedSamples <- sample %>%
    rep_sample_n(size = size, reps = reps, replace = TRUE) %>%
    count(text_ind) %>%
    mutate(p_hat = n / sum(n))
  # BUG FIX: the original referenced `notLabel`, which is never defined
  # anywhere in this file; the shared constant is `noLabel`. The original
  # would error whenever called with yes = FALSE.
  typeLabel <- if (yes) yesLabel else noLabel
  filteredSamples <- sizedSamples %>%
    filter(text_ind == typeLabel)
  print(ggplot(data = filteredSamples, aes(x = p_hat)) +
    geom_histogram(binwidth = binwidth) +
    labs(
      x = paste("p_hat (", typeLabel, ")", sep = ""),
      title = "Sampling distribution of p_hat",
      subtitle = paste("Sample size = ", size, "; Number of samples = ", reps, "; Bin width = ", binwidth, sep = "")
    ))
  print(filteredSamples)
  print(filteredSamples$p_hat %>%
    describe)
  print(sizedSamples$n %>%
    sum)
  return(sizedSamples)
}

# Report and plot the margin of error (2 standard errors) for a proportion.
#
# @param n        Sample size.
# @param p        Observed proportion.
# @param decimals Rounding precision used to highlight p on the ME curve.
# @return The margin of error for (n, p).
analyzeMarginOfError <- function(n, p, decimals = 3) {
# Margin of error ≈ 2 standard errors of a sample proportion.
me <- 2 * sqrt(p * (1 - p) / n)
print(paste("Margin of Error: ", me, sep = ""))
# Grid step for tracing the ME curve over every proportion in [0, 1].
binwidth = 1/ 10 ^ decimals
pList <- seq(from = 0, to = 1, by = binwidth)
meList <- 2 * sqrt(pList * (1 - pList) / n)
dd <- data.frame(proportion = pList, marginOfError = meList)
# Plot ME as a function of proportion, highlighting the grid point that
# matches p (rounded to the grid's precision).
print(ggplot(data = dd, aes(x = proportion, y = marginOfError)) +
geom_point() +
gghighlight(proportion == round(p, digits = decimals), label_key = marginOfError) +
labs(x = "Population Proportion",
y = "Margin of Error",
title = paste("Sample size = ", n, "; Proportion = ", p, "; Bin width = ", binwidth, sep = "")))
return(me)
}
# Margin of error for one `text_ind` level of `data`, optionally also
# simulating and plotting the sampling distribution of p_hat.
#
# @param data       Data frame with a `text_ind` column.
# @param row        Which text_ind level to analyze ("yes" or "no").
# @param size,reps,binwidth,yes Forwarded to analyzeSamplingProportionDistribution.
# @param distribute If TRUE, also run the sampling-distribution simulation.
# @return The margin of error for the selected level.
analyzeMarginOfErrorData <- function(data, row = "yes", size = NULL, reps = 15000, binwidth = .01, yes = TRUE, distribute = TRUE) {
  filteredData <- data %>%
    getProportions %>%
    filter(text_ind == row)
  # Named columns instead of positional [[2]]/[[3]]: this no longer breaks
  # if getProportions ever reorders its output columns.
  me <- analyzeMarginOfError(filteredData$n, filteredData$p)
  if (distribute) {
    # Called for its plot/print side effects; the original bound the result
    # to an unused local, which has been dropped.
    data %>%
      analyzeSamplingProportionDistribution(size, reps, binwidth, yes)
  }
  return(me)
}

# 95% bootstrap CI for the proportion of never-helmet riders texting daily.
# (Was fused onto the closing brace above in the original paste.)
interval95 <- no_helmet %>%
  analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0606 0.0727
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.07 0.01 0.07 0.07 0.01 0.06 0.07 0.01 0 -2.75 0.01
# Inspect the never-helmet subset, including the derived text_ind column.
print(no_helmet)## # A tibble: 6,977 × 14
## age gender grade hispanic race height weight helmet_12m text_while_driv…
## <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 14 female 9 not Black … NA NA never 0
## 2 14 female 9 not Black … NA NA never <NA>
## 3 15 female 9 hispanic Native… 1.73 84.4 never 30
## 4 15 female 9 not Black … 1.6 55.8 never 0
## 5 14 male 9 not Black … 1.88 71.2 never <NA>
## 6 15 male 9 not Black … 1.75 63.5 never <NA>
## 7 16 male 9 not Black … 1.68 74.8 never 0
## 8 14 male 9 not Black … 1.73 73.5 never did not drive
## 9 15 male 9 not Black … 1.83 67.6 never 0
## 10 16 male 9 not Black … 1.83 73.5 never did not drive
## # … with 6,967 more rows, and 5 more variables: physically_active_7d <int>,
## # hours_tv_per_school_day <chr>, strength_training_7d <int>,
## # school_night_hours_sleep <chr>, text_ind <chr>
# Margin of error for the never-helmet "yes" proportion; skip the
# sampling-distribution simulation (distribute = FALSE).
me <- no_helmet %>%
analyzeMarginOfErrorData(distribute = FALSE)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 6514 0.934
## 2 yes 463 0.0664
## [1] "Margin of Error: 0.0231358334802707"
The resulting sampling distribution is unimodal and symmetric (no noticeable skew), centered around the given proportion.
# Synthetic sample: 300 observations with a 10% "yes" proportion.
numOfElements = 300
proportion = .1
# NOTE(review): `data` shadows utils::data() at the top level — works, but a
# more specific name would be safer.
data = tibble(
text_ind = c(rep(yesLabel, numOfElements * proportion), rep(noLabel, numOfElements * (1 - proportion)))
)
me <- data %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 270 0.9
## 2 yes 30 0.1
## [1] "Margin of Error: 0.109544511501033"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 26 0.0867
## 2 2 yes 27 0.09
## 3 3 yes 39 0.13
## 4 4 yes 28 0.0933
## 5 5 yes 37 0.123
## 6 6 yes 29 0.0967
## 7 7 yes 26 0.0867
## 8 8 yes 30 0.1
## 9 9 yes 39 0.13
## 10 10 yes 30 0.1
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.02 0.1 0.1 0.02 0.04 0.18 0.13 0.15 -0.03 0
## [1] 4500000
The shape and center remain fairly consistent — unimodal, without skew, and centered around the given proportion. The margin of error, however, depends on the proportion: it starts at 0 when the proportion is 0%, increases to its maximum at 50%, and then decreases back to 0 at 100%.
# Proportions examined below; smallBinwidth keeps the extreme-proportion
# histograms readable.
minisculeProportion <- .01
tinyProportion <- .2
smallProportion <- .40
mediumProportion <- .60
largeProportion <- .80
massiveProportion <- .99
smallBinwidth <- .001

# Build a text_ind tibble with `count` rows and the given "yes" proportion.
# Extracted to replace six near-identical tibble() constructions; the
# arithmetic (count * proportion passed to rep()) is unchanged.
makeTextIndData <- function(count, proportion) {
  tibble(
    text_ind = c(rep(yesLabel, count * proportion),
                 rep(noLabel, count * (1 - proportion)))
  )
}

minisculeData <- makeTextIndData(numOfElements, minisculeProportion)
tinyData <- makeTextIndData(numOfElements, tinyProportion)
smallData <- makeTextIndData(numOfElements, smallProportion)
mediumData <- makeTextIndData(numOfElements, mediumProportion)
largeData <- makeTextIndData(numOfElements, largeProportion)
massiveData <- makeTextIndData(numOfElements, massiveProportion)
me <- minisculeData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 297 0.99
## 2 yes 3 0.01
## [1] "Margin of Error: 0.114891252930761"
## # A tibble: 14,285 × 4
## # Groups: replicate [14,285]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 4 0.0133
## 2 2 yes 1 0.00333
## 3 3 yes 3 0.01
## 4 4 yes 2 0.00667
## 5 5 yes 5 0.0167
## 6 6 yes 4 0.0133
## 7 7 yes 5 0.0167
## 8 8 yes 4 0.0133
## 9 9 yes 1 0.00333
## 10 10 yes 4 0.0133
## # … with 14,275 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 14285 0.01 0.01 0.01 0.01 0 0 0.05 0.04 0.8 0.67 0
## [1] 4500000
me <- tinyData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 240 0.8
## 2 yes 60 0.2
## [1] "Margin of Error: 0.103279555898864"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 73 0.243
## 2 2 yes 69 0.23
## 3 3 yes 63 0.21
## 4 4 yes 62 0.207
## 5 5 yes 59 0.197
## 6 6 yes 66 0.22
## 7 7 yes 61 0.203
## 8 8 yes 58 0.193
## 9 9 yes 59 0.197
## 10 10 yes 65 0.217
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.2 0.02 0.2 0.2 0.02 0.11 0.28 0.17 0.09 -0.08 0
## [1] 4500000
me <- smallData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 180 0.6
## 2 yes 120 0.4
## [1] "Margin of Error: 0.0894427190999916"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 117 0.39
## 2 2 yes 120 0.4
## 3 3 yes 118 0.393
## 4 4 yes 115 0.383
## 5 5 yes 128 0.427
## 6 6 yes 111 0.37
## 7 7 yes 140 0.467
## 8 8 yes 125 0.417
## 9 9 yes 118 0.393
## 10 10 yes 119 0.397
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.4 0.03 0.4 0.4 0.03 0.3 0.52 0.22 0.06 0.02 0
## [1] 4500000
me <- mediumData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 120 0.4
## 2 yes 180 0.6
## [1] "Margin of Error: 0.0730296743340221"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 184 0.613
## 2 2 yes 171 0.57
## 3 3 yes 182 0.607
## 4 4 yes 193 0.643
## 5 5 yes 179 0.597
## 6 6 yes 176 0.587
## 7 7 yes 165 0.55
## 8 8 yes 176 0.587
## 9 9 yes 185 0.617
## 10 10 yes 181 0.603
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.6 0.03 0.6 0.6 0.03 0.49 0.71 0.22 -0.03 -0.15 0
## [1] 4500000
me <- largeData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 59 0.197
## 2 yes 240 0.803
## [1] "Margin of Error: 0.0513789013235358"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 228 0.763
## 2 2 yes 235 0.786
## 3 3 yes 248 0.829
## 4 4 yes 248 0.829
## 5 5 yes 242 0.809
## 6 6 yes 245 0.819
## 7 7 yes 240 0.803
## 8 8 yes 245 0.819
## 9 9 yes 245 0.819
## 10 10 yes 244 0.816
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.8 0.02 0.8 0.8 0.02 0.7 0.9 0.19 -0.08 0.01 0
## [1] 4485000
me <- massiveData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 3 0.01
## 2 yes 297 0.99
## [1] "Margin of Error: 0.0115470053837925"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 299 0.997
## 2 2 yes 294 0.98
## 3 3 yes 298 0.993
## 4 4 yes 299 0.997
## 5 5 yes 298 0.993
## 6 6 yes 297 0.99
## 7 7 yes 295 0.983
## 8 8 yes 300 1
## 9 9 yes 298 0.993
## 10 10 yes 295 0.983
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.99 0.01 0.99 0.99 0 0.96 1 0.04 -0.56 0.34 0
## [1] 4500000
The margin of error decreases as the sample size n increases.
# Sample sizes for studying how the margin of error shrinks as n grows;
# binwidths scaled so each histogram stays readable.
tinyNumOfElements <- 10
smallNumOfElements <- 50
mediumNumOfElements <- 100
largeNumOfElements <- 1000
massiveNumOfElements <- 10000
largeBinwidth <- .1
mediumBinwidth <- .02

# Build a text_ind tibble with `count` rows and the given "yes" proportion.
# Defined locally so this section stands on its own; replaces five
# near-identical tibble() constructions with identical arithmetic.
makeTextIndData <- function(count, proportion) {
  tibble(
    text_ind = c(rep(yesLabel, count * proportion),
                 rep(noLabel, count * (1 - proportion)))
  )
}

tinyData <- makeTextIndData(tinyNumOfElements, proportion)
smallData <- makeTextIndData(smallNumOfElements, proportion)
mediumData <- makeTextIndData(mediumNumOfElements, proportion)
largeData <- makeTextIndData(largeNumOfElements, proportion)
massiveData <- makeTextIndData(massiveNumOfElements, proportion)
me <- tinyData %>%
analyzeMarginOfErrorData(binwidth = largeBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 9 0.9
## 2 yes 1 0.1
## [1] "Margin of Error: 0.6"
## # A tibble: 9,721 × 4
## # Groups: replicate [9,721]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 1 0.1
## 2 4 yes 2 0.2
## 3 5 yes 1 0.1
## 4 6 yes 2 0.2
## 5 7 yes 1 0.1
## 6 8 yes 1 0.1
## 7 9 yes 1 0.1
## 8 10 yes 2 0.2
## 9 12 yes 1 0.1
## 10 13 yes 1 0.1
## # … with 9,711 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 9721 0.15 0.07 0.1 0.14 0 0.1 0.6 0.5 1.35 1.57 0
## [1] 150000
me <- smallData %>%
analyzeMarginOfErrorData(binwidth = mediumBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 45 0.9
## 2 yes 5 0.1
## [1] "Margin of Error: 0.268328157299975"
## # A tibble: 14,919 × 4
## # Groups: replicate [14,919]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 8 0.16
## 2 2 yes 5 0.1
## 3 3 yes 8 0.16
## 4 4 yes 5 0.1
## 5 5 yes 6 0.12
## 6 6 yes 5 0.1
## 7 7 yes 8 0.16
## 8 8 yes 7 0.14
## 9 9 yes 3 0.06
## 10 10 yes 6 0.12
## # … with 14,909 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 14919 0.1 0.04 0.1 0.1 0.03 0.02 0.3 0.28 0.4 0.03 0
## [1] 750000
me <- mediumData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 90 0.9
## 2 yes 10 0.1
## [1] "Margin of Error: 0.189736659610103"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 5 0.05
## 2 2 yes 4 0.04
## 3 3 yes 15 0.15
## 4 4 yes 4 0.04
## 5 5 yes 5 0.05
## 6 6 yes 13 0.13
## 7 7 yes 10 0.1
## 8 8 yes 12 0.12
## 9 9 yes 15 0.15
## 10 10 yes 4 0.04
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.03 0.1 0.1 0.03 0.01 0.24 0.23 0.26 0.03 0
## [1] 1500000
me <- largeData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 900 0.9
## 2 yes 100 0.1
## [1] "Margin of Error: 0.06"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 96 0.096
## 2 2 yes 101 0.101
## 3 3 yes 83 0.083
## 4 4 yes 109 0.109
## 5 5 yes 101 0.101
## 6 6 yes 99 0.099
## 7 7 yes 101 0.101
## 8 8 yes 97 0.097
## 9 9 yes 92 0.092
## 10 10 yes 99 0.099
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.01 0.1 0.1 0.01 0.06 0.14 0.08 0.11 0.03 0
## [1] 15000000
me <- massiveData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 9000 0.9
## 2 yes 1000 0.1
## [1] "Margin of Error: 0.0189736659610103"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 999 0.0999
## 2 2 yes 1001 0.100
## 3 3 yes 1029 0.103
## 4 4 yes 1036 0.104
## 5 5 yes 1008 0.101
## 6 6 yes 1007 0.101
## 7 7 yes 1019 0.102
## 8 8 yes 1048 0.105
## 9 9 yes 970 0.097
## 10 10 yes 964 0.0964
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0 0.1 0.1 0 0.09 0.11 0.02 0.04 0.06 0
## [1] 150000000
Of those who sleep 10+ hours on school nights, about 27% strength train every day, and the 95% confidence interval for this proportion is roughly 22%–32%. This appears to be a meaningful pattern, given that the only other prevalent response is 0 days of training, at about 31%.
H0: Those who sleep 10+ hours per day are NOT more likely to strength train every day of the week. HA: Those who sleep 10+ hours per day are more likely to strength train every day of the week.
# Response levels for the strength-training / sleep analysis.
type7 = "7"
type10Plus <- "10+"
binwidth = .0003
#exerciseEveryday <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(strength_training_7d)
# & strength_training_7d == type7, yesLabel, noLabel))
#
#exerciseEverydayMarginOfError <- exerciseEveryday %>%
# analyzeMarginOfErrorData(binwidth = binwidth)
#
#exerciseEverydayConfidenceInerval <- exerciseEveryday %>%
# analyzeConfidenceInterval
#sleep10OrMoreHours <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(school_night_hours_sleep)
# & school_night_hours_sleep == type10Plus, yesLabel, noLabel))
#
#sleep10OrMoreHoursMarginOfError <- sleep10OrMoreHours %>%
# analyzeMarginOfErrorData(binwidth = binwidth)
#
#sleep10OrMoreHoursConfidenceInerval <- sleep10OrMoreHours %>%
# analyzeConfidenceInterval
# Students who strength train all 7 days, flagged "yes"/"no" for sleeping
# 10+ hours on school nights (NA sleep responses treated as "no").
exerciseEverydayAndSleep10OrMoreHours <- yrbss %>%
filter(strength_training_7d == type7) %>%
mutate(text_ind = ifelse(!is.na(school_night_hours_sleep)
& school_night_hours_sleep == type10Plus, yesLabel, noLabel))
# Margin of error (and sampling distribution) for that "yes" proportion.
exerciseEverydayAndSleep10OrMoreHoursMarginOfError <- exerciseEverydayAndSleep10OrMoreHours %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 2001 0.960
## 2 yes 84 0.0403
## [1] "Margin of Error: 0.0429089098251224"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 92 0.0441
## 2 2 yes 74 0.0355
## 3 3 yes 91 0.0436
## 4 4 yes 84 0.0403
## 5 5 yes 80 0.0384
## 6 6 yes 82 0.0393
## 7 7 yes 85 0.0408
## 8 8 yes 94 0.0451
## 9 9 yes 78 0.0374
## 10 10 yes 92 0.0441
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.04 0 0.04 0.04 0 0.02 0.06 0.03 0.11 -0.03 0
## [1] 31275000
# 95% bootstrap CI for the same proportion.
# NOTE(review): "Inerval" is a typo for "Interval" in the variable name —
# left as-is in case it is referenced elsewhere.
exerciseEverydayAndSleep10OrMoreHoursConfidenceInerval <- exerciseEverydayAndSleep10OrMoreHours %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0312 0.0489
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.04 0.01 0.04 0.04 0.01 0.03 0.05 0.02 0 -2.75 0.01
# The reverse conditioning: students who sleep 10+ hours on school nights,
# flagged "yes"/"no" for strength training all 7 days (NA treated as "no").
sleep10OrMoreHoursAndExerciseEveryday <- yrbss %>%
filter(school_night_hours_sleep == type10Plus) %>%
mutate(text_ind = ifelse(!is.na(strength_training_7d)
& strength_training_7d == type7, yesLabel, noLabel))
# Margin of error (and sampling distribution) for this "yes" proportion.
sleep10OrMoreHoursAndExerciseEverydayMarginOfError <- sleep10OrMoreHoursAndExerciseEveryday %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 232 0.734
## 2 yes 84 0.266
## [1] "Margin of Error: 0.0964021912134672"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 95 0.301
## 2 2 yes 88 0.278
## 3 3 yes 88 0.278
## 4 4 yes 97 0.307
## 5 5 yes 90 0.285
## 6 6 yes 83 0.263
## 7 7 yes 75 0.237
## 8 8 yes 87 0.275
## 9 9 yes 86 0.272
## 10 10 yes 91 0.288
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.27 0.02 0.27 0.27 0.02 0.16 0.36 0.2 0.02 -0.02 0
## [1] 4740000
# 95% bootstrap CI for the sleep-10+ group's everyday-training proportion.
sleep10OrMoreHoursAndExerciseEverydayConfidenceInerval <- sleep10OrMoreHoursAndExerciseEveryday %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.222 0.316
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.27 0.07 0.27 0.27 0.07 0.22 0.32 0.09 0 -2.75 0.05
# Full breakdown of training days among the sleep-10+ group.
# NOTE(review): mirrors getProportions but for strength_training_7d;
# getProportions is hardcoded to text_ind, hence the inline duplication.
sleep10OrMoreHoursAndExerciseEveryday %>%
select(strength_training_7d) %>%
count(strength_training_7d) %>%
mutate(p = n / sum(n)) %>%
select(strength_training_7d, n, p)## # A tibble: 9 × 3
## strength_training_7d n p
## <int> <int> <dbl>
## 1 0 100 0.316
## 2 1 17 0.0538
## 3 2 31 0.0981
## 4 3 31 0.0981
## 5 4 18 0.0570
## 6 5 23 0.0728
## 7 6 8 0.0253
## 8 7 84 0.266
## 9 NA 4 0.0127
The null hypothesis is rejected when the p-value is less than or equal to the significance level, which in this case is .05, or 5%. The significance level represents how often we would mistakenly reject the null hypothesis when it is in fact true (the Type I error rate).
Using a proportion of 50% (which maximizes the required sample size) and a target margin of error of 1%, the ideal sample size is 9604.
# Required n = z^2 * p(1 - p) / ME^2 with z = 1.96 (95%), p = 0.5, ME = 0.01.
((1.96 ^ 2) * (0.5 * 0.5)) / 0.01 ^ 2 ## [1] 9604